df = read.csv("/Users/esterperdochova/Desktop/STAT DS/Bike-Sharing-Dataset/hour.csv")
df
df2 = df[df$season %in% c(2,3), ]
df2 = df2[df2$workingday == 1, ]
df2

How does it look? (Hourly Aggregation)

df2 %>%
  mutate(hrFct = fct_rev(as.factor(hr))) %>%
  ggplot(aes(y = hrFct)) +
  geom_density_ridges(
    aes(x = cnt, fill = paste(hrFct, season)), 
    alpha = .8, color = "white"#, from = 0, to = 100
  ) +
  labs(
    x = "Count",
    y = "Hour",
    title = "Distribution of cnt w.r.t. time and season."
  ) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_fill_cyclical(
    breaks = c("Spring", "Summer"),
    labels = c(`Spring` = "Spring", `Summer` = "Summer"),
    values = c("#ff0000", "#0000ff", "#ff8080", "#8080ff"),
    name = "Season", guide = "legend"
  ) +
  coord_cartesian(clip = "off") +
  theme_ridges(grid = FALSE)
## Picking joint bandwidth of 25.3

What is peak evening hours?

-> Let’s find highest running X hour count

query = "
SELECT hr
     , season
     , SUM(cnt) cnt
FROM df2
GROUP BY hr
       , season
"
(df_hour_season = sqldf(query))
peak_finder = function(df, roll_length) {
  temp_list = c()
  for (i in seq(0, 23)) {
    temp_range = seq(i, i+roll_length-1) %% 24
    temp_count = sum(df[df$hr %in% temp_range, ]$cnt)
    temp_list = c(temp_list, temp_count)
  }
  temp_loc = which.max(temp_list) + 1
  peak_range = seq(temp_loc, temp_loc+roll_length-1) %% 24
  return(peak_range)
}

# For spring
(peak_spring = peak_finder(df_hour_season[df_hour_season$season == '2', ], 4))
## [1] 18 19 20 21
# For summer
(peak_summer = peak_finder(df_hour_season[df_hour_season$season == '3', ], 4))
## [1] 18 19 20 21
# Combo:
(peak_combo = peak_finder(df_hour_season, 4))
## [1] 18 19 20 21

Above plot but for peak hours only:

df2[df2$hr %in% peak_spring, ]
df2[df2$hr %in% peak_spring, ] %>%
  mutate(hrFct = fct_rev(as.factor(hr))) %>%
  ggplot(aes(y = hrFct)) +
  geom_density_ridges(
    aes(x = cnt, fill = paste(hrFct, season)), 
    alpha = .8, color = "white"#, from = 0, to = 100
  ) +
  labs(
    x = "Count",
    y = "Hour",
    title = "Distribution of cnt w.r.t. time and season."
  ) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_fill_cyclical(
    breaks = c("Spring", "Summer"),
    labels = c(`Spring` = "Spring", `Summer` = "Summer"),
    values = c("#ff0000", "#0000ff", "#ff8080", "#8080ff"),
    name = "Season", guide = "legend"
  ) +
  coord_cartesian(clip = "off") +
  theme_ridges(grid = FALSE)
## Picking joint bandwidth of 42.5

Are the distributions normal?

visualise_df = function(df, name) {
  name          = gsub(" ", "\n", name)
  n_breaks      = max(min(nrow(df)/200, 40), 20)
  temp_dens     = density(df$cnt, adjust = 2)
  temp_dens_log = density(log(df$cnt), adjust = 2)
  
  hist(df$cnt,
    col    = "floralwhite",
    breaks = n_breaks,
    prob   = TRUE,
    xlab   = "Cnt",
    main   = "Hist.: Cnt")
  lines(temp_dens,
    col = "firebrick4")
  legend("topright", legend = name, bty = "n")

  hist(log(df$cnt),
    col    = "floralwhite",
    breaks = n_breaks,
    prob   = TRUE,
    xlab   = "ln(Cnt)",
    main   = "Hist.: ln(Cnt)")
  lines(temp_dens_log,
    col = "firebrick4")
  legend("topright", legend = name, bty = "n")
}
# For spring:
df_spring = df2[df2$season == 2, ]
df_spring = df_spring[df_spring$hr %in% peak_spring, ]
shapiro.test(df_spring$cnt)
## 
##  Shapiro-Wilk normality test
## 
## data:  df_spring$cnt
## W = 0.96189, p-value = 3.009e-10
descdist(df_spring$cnt, boot = 1000)

## summary statistics
## ------
## min:  22   max:  868 
## median:  318.5 
## mean:  351.2266 
## estimated sd:  186.9774 
## estimated skewness:  0.630501 
## estimated kurtosis:  2.857972
(dist_object = fitdist(df_spring$cnt, "norm"))
## Fitting of the distribution ' norm ' by maximum likelihood 
## Parameters:
##      estimate Std. Error
## mean 351.2266   8.255288
## sd   186.7947   5.837337
plot(dist_object)

# For summer:
df_summer = df2[df2$season == 3, ]
df_summer = df_summer[df_summer$hr %in% peak_summer, ]
shapiro.test(df_summer$cnt)
## 
##  Shapiro-Wilk normality test
## 
## data:  df_summer$cnt
## W = 0.95379, p-value = 9.655e-12
descdist(df_summer$cnt, boot = 1000)

## summary statistics
## ------
## min:  38   max:  977 
## median:  381 
## mean:  418.0401 
## estimated sd:  190.0553 
## estimated skewness:  0.7097991 
## estimated kurtosis:  2.991612
(dist_object = fitdist(df_summer$cnt, "norm"))
## Fitting of the distribution ' norm ' by maximum likelihood 
## Parameters:
##      estimate Std. Error
## mean 418.0401   8.294648
## sd   189.8739   5.865202
plot(dist_object)

par(mar = c(4, 3, 3, 3), mgp = c(2, 0.5,0))
graphics::layout(mat = matrix(1:4, nrow = 2, ncol = 2, byrow = F),
                 heights = 1, widths = 1)

visualise_df(df_spring, "Spring")
visualise_df(df_summer, "Summer")

Are the two seasons from the same population?

ks.test(df_spring$cnt, df_summer$cnt)
## Warning in ks.test.default(df_spring$cnt, df_summer$cnt): p-value will be
## approximate in the presence of ties
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  df_spring$cnt and df_summer$cnt
## D = 0.16761, p-value = 9.591e-07
## alternative hypothesis: two-sided

How do they differ?

df3 = df2
df3$y_bool = df3$season - 2
df3 = df3[, -3]
df3$dteday = as.numeric(as.Date(df3$dteday))

mod1 = glm(y_bool ~ ., data = df3, family = binomial(link = 'logit'))
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mod1)
## 
## Call:
## glm(formula = y_bool ~ ., family = binomial(link = "logit"), 
##     data = df3)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -4.971e-04  -2.000e-08   2.000e-08   2.000e-08   4.937e-04  
## 
## Coefficients: (3 not defined because of singularities)
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)  2.398e+07  4.739e+08   0.051    0.960
## instant      6.750e+01  1.330e+03   0.051    0.960
## dteday      -1.601e+03  3.164e+04  -0.051    0.960
## yr          -4.966e+03  9.478e+04  -0.052    0.958
## mnth        -1.640e+02  4.299e+03  -0.038    0.970
## hr          -6.752e+01  1.332e+03  -0.051    0.960
## holiday             NA         NA      NA       NA
## weekday      1.438e+01  3.724e+02   0.039    0.969
## workingday          NA         NA      NA       NA
## weathersit   2.671e-01  5.447e+02   0.000    1.000
## temp         1.790e+01  2.198e+04   0.001    0.999
## atemp        1.192e+01  2.053e+04   0.001    1.000
## hum          9.989e+00  6.216e+03   0.002    0.999
## windspeed    3.538e+00  4.769e+03   0.001    0.999
## casual      -2.598e-02  2.916e+01  -0.001    0.999
## registered   2.792e-03  3.285e+00   0.001    0.999
## cnt                 NA         NA      NA       NA
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8.6038e+03  on 6206  degrees of freedom
## Residual deviance: 8.7607e-06  on 6193  degrees of freedom
## AIC: 28
## 
## Number of Fisher Scoring iterations: 25
df3 = as.matrix(df3)

bst = xgboost(df3[, -17], df3[, 17], nrounds = 50,
              eta = 0.2, max_depth = 5, subsample = .5, verbose = 0)

xgb.plot.shap(df3[, -17], model = bst, top_n = 4, n_col = 2)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 2.97
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.03
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 4.1209

xgb.ggplot.shap.summary(df3[, -17], model = bst)